{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>\n",
    "\n",
    "<i>Licensed under the MIT License.</i>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Item2Item recommendations with DKN \n",
    "The second task is about knowledge-aware item-to-item recommendations. We still use DKN for demonstration. \n",
    "The learning framework is illustrated as follows:\n",
    "<img src=\"https://recodatasets.z20.web.core.windows.net/kdd2020/images/Item2item-framework.JPG\"  width=\"500\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library. `os` is imported explicitly because later cells call `os.path.join`;\n",
    "# previously it was only reachable through the wildcard imports below.\n",
    "import os\n",
    "import sys\n",
    "import time\n",
    "\n",
    "import tensorflow as tf\n",
    "\n",
    "# Extend the module search path three directory levels up, presumably so that\n",
    "# the `reco_utils` package can be imported when running from this folder.\n",
    "sys.path.append(\"../../../\")\n",
    "# The wildcard imports are kept on purpose: later cells rely on names they export\n",
    "# (e.g. `prepare_hparams`, `DKNItem2Item`, `DKNItem2itemTextIterator`).\n",
    "from reco_utils.recommender.deeprec.deeprec_utils import *\n",
    "from reco_utils.recommender.deeprec.models.dkn_item2item import *\n",
    "from reco_utils.recommender.deeprec.io.dkn_item2item_iterator import *\n",
    "\n",
    "# Only surface TensorFlow log messages at ERROR level.\n",
    "tf.logging.set_verbosity(tf.logging.ERROR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: locations of the DKN config, instance files, feature file and embeddings.\n",
    "# Each name is assigned exactly once (a duplicate `news_feature_file` assignment was removed).\n",
    "data_path = 'data_folder/my/DKN-training-folder'\n",
    "yaml_file = './dkn.yaml' #os.path.join(data_path, r'../../../../../../dkn.yaml')\n",
    "train_file = os.path.join(data_path, r'item2item_train_instances.txt')\n",
    "valid_file = os.path.join(data_path, r'item2item_valid_instances.txt')\n",
    "news_feature_file = os.path.join(data_path, r'../paper_feature.txt')\n",
    "wordEmb_file = os.path.join(data_path, r'word_embedding.npy')\n",
    "entityEmb_file = os.path.join(data_path, r'entity_embedding.npy')\n",
    "contextEmb_file = os.path.join(data_path, r'context_embedding.npy')\n",
    "infer_embedding_file = os.path.join(data_path, r'infer_embedding_item2item.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<bound method HParams.values of HParams([('DNN_FIELD_NUM', None), ('EARLY_STOP', 100), ('FEATURE_COUNT', None), ('FIELD_COUNT', None), ('L', None), ('MODEL_DIR', 'data_folder/my/DKN-training-folder/save_models'), ('PAIR_NUM', None), ('SUMMARIES_DIR', None), ('T', None), ('activation', ['sigmoid']), ('att_fcn_layer_sizes', None), ('attention_activation', 'relu'), ('attention_dropout', 0.0), ('attention_layer_sizes', 32), ('attention_size', None), ('batch_size', 32), ('cate_embedding_dim', None), ('cate_vocab', None), ('contextEmb_file', 'data_folder/my/DKN-training-folder/context_embedding.npy'), ('cross_activation', 'identity'), ('cross_l1', 0.0), ('cross_l2', 0.0), ('cross_layer_sizes', None), ('cross_layers', None), ('data_format', 'dkn'), ('decay', None), ('dilations', None), ('dim', 32), ('doc_size', 15), ('dropout', [0.0]), ('dtype', 32), ('embed_l1', 0.0), ('embed_l2', 0.0), ('embed_size', None), ('embedding_dropout', 0.3), ('enable_BN', False), ('entityEmb_file', 'data_folder/my/DKN-training-folder/entity_embedding.npy'), ('entity_dim', 32), ('entity_embedding_method', 'TransE'), ('entity_size', 57267), ('epochs', 10), ('eval_epoch', None), ('fast_CIN_d', 0), ('filter_sizes', [1, 2, 3]), ('hidden_size', None), ('history_size', None), ('init_method', 'uniform'), ('init_value', 0.01), ('is_clip_norm', True), ('item_embedding_dim', None), ('item_vocab', None), ('iterator_type', None), ('kernel_size', None), ('kg_file', None), ('kg_training_interval', 5), ('layer_l1', 0.0), ('layer_l2', 0.0), ('layer_sizes', [300]), ('learning_rate', 0.0002), ('load_model_name', None), ('load_saved_model', False), ('loss', 'log_loss'), ('lr_kg', 0.5), ('lr_rs', 1), ('max_grad_norm', 0.5), ('max_seq_length', None), ('method', 'classification'), ('metrics', ['auc']), ('min_seq_length', 1), ('model_type', 'dkn'), ('mu', None), ('n_h', None), ('n_item', None), ('n_item_attr', None), ('n_layers', None), ('n_user', None), ('n_user_attr', None), ('n_v', None), ('need_sample', 
True), ('news_feature_file', 'data_folder/my/DKN-training-folder/../paper_feature.txt'), ('num_filters', 50), ('optimizer', 'adam'), ('pairwise_metrics', ['group_auc', 'mean_mrr', 'ndcg@2;4;6']), ('reg_kg', 0.0), ('save_epoch', 1), ('save_model', True), ('show_step', 10000), ('top_k', None), ('train_num_ngs', 4), ('train_ratio', None), ('transform', True), ('use_CIN_part', False), ('use_DNN_part', False), ('use_FM_part', False), ('use_Linear_part', False), ('use_context', True), ('use_entity', True), ('user_clicks', None), ('user_dropout', False), ('user_embedding_dim', None), ('user_history_file', None), ('user_vocab', None), ('wordEmb_file', 'data_folder/my/DKN-training-folder/word_embedding.npy'), ('word_size', 194755), ('write_tfevents', False)])>\n"
     ]
    }
   ],
   "source": [
    "# Build the hyperparameter object from the YAML config, overriding selected values.\n",
    "# NOTE(review): `his_size` does not appear among the printed hyperparameters\n",
    "# (only `history_size` does) -- confirm that this argument has any effect.\n",
    "epoch = 10\n",
    "hparams = prepare_hparams(yaml_file,\n",
    "                          news_feature_file=news_feature_file,\n",
    "                          wordEmb_file=wordEmb_file,\n",
    "                          entityEmb_file=entityEmb_file,\n",
    "                          contextEmb_file=contextEmb_file,\n",
    "                          epochs=epoch,\n",
    "                          is_clip_norm=True,\n",
    "                          max_grad_norm=0.5,\n",
    "                          his_size=20,\n",
    "                          MODEL_DIR=os.path.join(data_path, 'save_models'),\n",
    "                          learning_rate=0.0002,\n",
    "                          embed_l2=0.0,\n",
    "                          layer_l2=0.0,\n",
    "                          batch_size=32,\n",
    "                          use_entity=True,\n",
    "                          use_context=True\n",
    "                          )\n",
    "# `values` is a method: call it so the hyperparameters themselves are printed,\n",
    "# not the repr of the bound method.\n",
    "print(hparams.values())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To build an item2item recommendation model based on the Recommender repo, you only need to modify two files: \n",
    "1. Data Loader :  dkn_item2item_iterator.py\n",
    "2. Model : dkn_item2item.py\n",
    "\n",
    "<img src=\"https://recodatasets.z20.web.core.windows.net/kdd2020/images%2Fcode-changed-item2item.JPG\" width=\"700\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_creator = DKNItem2itemTextIterator\n",
    "hparams.neg_num=9"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A special parameter is `neg_num`. It indicates how many negative instances exist in a group for softmax computation.\n",
    "Training and validation instances are organized as follows: \n",
    "\n",
    "<img src=\"https://recodatasets.z20.web.core.windows.net/kdd2020/images/item2item-instances.JPG\" width=\"700\">\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = DKNItem2Item(hparams, input_creator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'group_auc': 0.8558, 'mean_mrr': 0.6916, 'ndcg@2': 0.6443, 'ndcg@4': 0.7192, 'ndcg@6': 0.7482}\n",
      "0.9248842358589172\n"
     ]
    }
   ],
   "source": [
    "t01 = time.time()\n",
    "print(model.run_eval(valid_file))\n",
    "t02 = time.time()\n",
    "print((t02 - t01) / 60)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "at epoch 1\n",
      "train info: logloss loss:50.36440721690707\n",
      "eval info: group_auc:0.9498, mean_mrr:0.8773, ndcg@2:0.8712, ndcg@4:0.8951, ndcg@6:0.901\n",
      "at epoch 1 , train time: 41.6 eval time: 54.5\n",
      "at epoch 2\n",
      "train info: logloss loss:47.04455384063052\n",
      "eval info: group_auc:0.9514, mean_mrr:0.8879, ndcg@2:0.8827, ndcg@4:0.9027, ndcg@6:0.9078\n",
      "at epoch 2 , train time: 40.5 eval time: 54.5\n",
      "at epoch 3\n",
      "train info: logloss loss:45.84113100337731\n",
      "eval info: group_auc:0.9526, mean_mrr:0.8934, ndcg@2:0.8885, ndcg@4:0.9065, ndcg@6:0.9118\n",
      "at epoch 3 , train time: 40.7 eval time: 55.0\n",
      "at epoch 4\n",
      "train info: logloss loss:45.158868913040365\n",
      "eval info: group_auc:0.9539, mean_mrr:0.8974, ndcg@2:0.8932, ndcg@4:0.9099, ndcg@6:0.9148\n",
      "at epoch 4 , train time: 40.1 eval time: 54.2\n",
      "at epoch 5\n",
      "train info: logloss loss:44.69093512051153\n",
      "eval info: group_auc:0.9551, mean_mrr:0.9007, ndcg@2:0.8967, ndcg@4:0.9126, ndcg@6:0.9173\n",
      "at epoch 5 , train time: 40.1 eval time: 54.2\n",
      "at epoch 6\n",
      "train info: logloss loss:44.35335900643538\n",
      "eval info: group_auc:0.9556, mean_mrr:0.9023, ndcg@2:0.898, ndcg@4:0.9139, ndcg@6:0.9188\n",
      "at epoch 6 , train time: 40.2 eval time: 54.0\n",
      "at epoch 7\n",
      "train info: logloss loss:44.07531811900561\n",
      "eval info: group_auc:0.9557, mean_mrr:0.9032, ndcg@2:0.8991, ndcg@4:0.9149, ndcg@6:0.9192\n",
      "at epoch 7 , train time: 40.4 eval time: 54.4\n",
      "at epoch 8\n",
      "train info: logloss loss:43.86925596192048\n",
      "eval info: group_auc:0.9549, mean_mrr:0.9024, ndcg@2:0.8982, ndcg@4:0.9138, ndcg@6:0.9183\n",
      "at epoch 8 , train time: 40.3 eval time: 54.4\n",
      "at epoch 9\n",
      "train info: logloss loss:43.69110510823603\n",
      "eval info: group_auc:0.9545, mean_mrr:0.9024, ndcg@2:0.8982, ndcg@4:0.9137, ndcg@6:0.918\n",
      "at epoch 9 , train time: 40.2 eval time: 54.2\n",
      "at epoch 10\n",
      "train info: logloss loss:43.547584555088484\n",
      "eval info: group_auc:0.9559, mean_mrr:0.9045, ndcg@2:0.9012, ndcg@4:0.9156, ndcg@6:0.92\n",
      "at epoch 10 , train time: 40.1 eval time: 54.2\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<reco_utils.recommender.deeprec.models.dkn_item2item.DKNItem2Item at 0x7f54e400ba58>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(train_file, valid_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<reco_utils.recommender.deeprec.models.dkn_item2item.DKNItem2Item at 0x7f54e400ba58>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.run_get_embedding(news_feature_file, infer_embedding_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Again, we compare the performance of DKN with knowledge entities against DKN without them (DKN(-)):\n",
    "\n",
    "| Models | Group-AUC | MRR | NDCG@2 | NDCG@4 |\n",
    "| :------| :------: | :------: | :------: | :------ |\n",
    "| DKN | 0.9557 | 0.8993 | 0.8951 | 0.9123 |\n",
    "| DKN(-) | 0.9506 | 0.8817 | 0.8758 | 0.8982 |\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "reco_gpu",
   "language": "python",
   "name": "reco_gpu"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
